工作中需要提取Word文档中的标题,在此做个笔记记录一下
思路比较简单:逐段读取文档文本,再用正则表达式匹配引号中的内容作为标题
用到的正则表达式(匹配中文或英文引号之间的内容):
public static Pattern titlePattern = Pattern.compile("(?<=“|\").+?(?=”|\")|(?<=“|\"|‘).+?(?=”|\"|’)");
/**
 * Extracts at most one title candidate from every non-blank paragraph of a Word file.
 *
 * <p>The paragraphs come from {@code readWord(file)}. For each paragraph that still has
 * characters left after whitespace and {@code |} are stripped, the first match of
 * {@code titlePattern} (if any) is added to the result.
 *
 * @param file path of the Word file to scan
 * @return the first {@code titlePattern} match of each paragraph, in document order;
 *         empty when no paragraphs could be read or nothing matched
 */
public static List<String> exactTitle(String file){
    List<String> result = new ArrayList<String>();
    String[] paraText = readWord(file);
    if (paraText == null) {
        // readWord may hand back null when it cannot pick a reader for the file;
        // treat that as "no paragraphs" instead of failing with a NullPointerException.
        return result;
    }
    for (String text : paraText) {
        if (text == null) {
            continue;
        }
        // Skip paragraphs that contain nothing but whitespace / line breaks / '|'.
        if (text.replaceAll("[\\s\r\n|\n]", "").length() > 0) {
            Matcher m = titlePattern.matcher(text);
            // Only the first match per paragraph is kept.
            if (m.find()) {
                result.add(m.group());
            }
        }
    }
    return result;
}
/**
 * Reads the paragraphs of a Word file, choosing the POI reader from the file extension.
 *
 * <p>A file name ending in {@code .doc} (case-insensitive) is passed to
 * {@code readWordDoc}; any other extension is passed to {@code readWordDocx}.
 * When the file name has no extension, a message is printed and an empty array
 * is returned. The input stream opened here is always closed before returning.
 *
 * @param file path of the Word file
 * @return the paragraph texts; an empty array (never {@code null}) when the file
 *         name has no extension
 * @throws IllegalArgumentException if the file cannot be opened or read; the
 *         underlying {@link IOException} is attached as the cause
 */
private static String[] readWord(String file) {
    String[] content = new String[0];
    InputStream is = null;
    try {
        is = new FileInputStream(file);
        // Use the LAST dot, and only if it belongs to the file name itself, so that
        // paths such as "./a.doc", "dir.v1/a" or "report.v2.doc" are classified correctly.
        int separator = Math.max(file.lastIndexOf('/'), file.lastIndexOf('\\'));
        int index = file.lastIndexOf('.');
        if (index > separator) {
            String endWith = file.substring(index);
            if (endWith.equalsIgnoreCase(".doc")) {
                content = readWordDoc(is);
            } else {
                // NOTE(review): every non-".doc" extension is treated as docx — confirm intended.
                content = readWordDocx(is);
            }
        } else {
            System.out.println("文件格式不是word格式!");
        }
    } catch (IOException e) {
        // Keep the original exception type and message, but preserve the cause.
        throw new IllegalArgumentException(file + "文件不存在", e);
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException ignored) {
                // Best-effort close: the content has already been read (or a failure is
                // already propagating), so a close error must not mask the outcome.
            }
        }
    }
    return content;
}
// The methods below read Word documents using POI
/**
 * Reads a Word document through XWPF and splits its text into lines.
 *
 * <p>The whole text is obtained from an {@code XWPFWordExtractor} and then split
 * on {@code \r\n} or {@code \n}. The stream is not closed by this method.
 *
 * @param is input stream positioned at the start of the document
 * @return the extracted text, one array element per line
 * @throws IOException declared by this method; propagated from the POI calls
 */
private static String[] readWordDocx(InputStream is) throws IOException {
    XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(is));
    String fullText = extractor.getText();
    return fullText.split("\r\n|\n");
}
/**
 * Reads a Word document through HWPF and returns its text paragraph by paragraph.
 *
 * <p>The result is whatever {@code WordExtractor.getParagraphText()} yields for the
 * document built from the stream. The stream is not closed by this method.
 *
 * @param is input stream positioned at the start of the document
 * @return the paragraph texts reported by the extractor
 * @throws IOException declared by this method; propagated from the POI calls
 */
public static String[] readWordDoc(InputStream is) throws IOException {
    WordExtractor extractor = new WordExtractor(new HWPFDocument(is));
    String[] paragraphs = extractor.getParagraphText();
    return paragraphs;
}